{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Big data? 🤗 Datasets ao resgate"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Install the Transformers, Datasets, and Evaluate libraries to run this notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install datasets evaluate transformers[sentencepiece]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install zstandard"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['meta', 'text'],\n",
       "    num_rows: 15518009\n",
       "})"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "# This takes a few minutes to run, so go grab a tea or coffee while you wait :)\n",
    "data_files = \"https://the-eye.eu/public/AI/pile_preliminary_components/PUBMED_title_abstracts_2019_baseline.jsonl.zst\"\n",
    "pubmed_dataset = load_dataset(\"json\", data_files=data_files, split=\"train\")\n",
    "pubmed_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'meta': {'pmid': 11409574, 'language': 'eng'},\n",
       " 'text': 'Epidemiology of hypoxaemia in children with acute lower respiratory infection.\\nTo determine the prevalence of hypoxaemia in children aged under 5 years suffering acute lower respiratory infections (ALRI), the risk factors for hypoxaemia in children under 5 years of age with ALRI, and the association of hypoxaemia with an increased risk of dying in children of the same age ...'}"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pubmed_dataset[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install psutil"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "RAM used: 5678.33 MB"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import psutil\n",
    "\n",
    "# Process.memory_info is expressed in bytes, so convert to megabytes\n",
    "print(f\"RAM used: {psutil.Process().memory_info().rss / (1024 * 1024):.2f} MB\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Number of files in dataset : 20979437051\n",
       "Dataset size (cache file) : 19.54 GB"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print(f\"Number of files in dataset : {pubmed_dataset.dataset_size}\")\n",
    "size_gb = pubmed_dataset.dataset_size / (1024**3)\n",
    "print(f\"Dataset size (cache file) : {size_gb:.2f} GB\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Iterated over 15518009 examples (about 19.5 GB) in 64.2s, i.e. 0.304 GB/s'"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import timeit\n",
    "\n",
    "code_snippet = \"\"\"batch_size = 1000\n",
    "\n",
    "for idx in range(0, len(pubmed_dataset), batch_size):\n",
    "    _ = pubmed_dataset[idx:idx + batch_size]\n",
    "\"\"\"\n",
    "\n",
    "time = timeit.timeit(stmt=code_snippet, number=1, globals=globals())\n",
    "print(\n",
    "    f\"Iterated over {len(pubmed_dataset)} examples (about {size_gb:.1f} GB) in \"\n",
    "    f\"{time:.1f}s, i.e. {size_gb/time:.3f} GB/s\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pubmed_dataset_streamed = load_dataset(\n",
    "    \"json\", data_files=data_files, split=\"train\", streaming=True\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'meta': {'pmid': 11409574, 'language': 'eng'},\n",
       " 'text': 'Epidemiology of hypoxaemia in children with acute lower respiratory infection.\\nTo determine the prevalence of hypoxaemia in children aged under 5 years suffering acute lower respiratory infections (ALRI), the risk factors for hypoxaemia in children under 5 years of age with ALRI, and the association of hypoxaemia with an increased risk of dying in children of the same age ...'}"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "next(iter(pubmed_dataset_streamed))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'input_ids': [101, 4958, 5178, 4328, 6779, ...], 'attention_mask': [1, 1, 1, 1, 1, ...]}"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from transformers import AutoTokenizer\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-uncased\")\n",
    "tokenized_dataset = pubmed_dataset_streamed.map(lambda x: tokenizer(x[\"text\"]))\n",
    "next(iter(tokenized_dataset))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'meta': {'pmid': 11410799, 'language': 'eng'},\n",
       " 'text': 'Randomized study of dose or schedule modification of granulocyte colony-stimulating factor in platinum-based chemotherapy for elderly patients with lung cancer ...'}"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "shuffled_dataset = pubmed_dataset_streamed.shuffle(buffer_size=10_000, seed=42)\n",
    "next(iter(shuffled_dataset))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'meta': {'pmid': 11409574, 'language': 'eng'},\n",
       "  'text': 'Epidemiology of hypoxaemia in children with acute lower respiratory infection ...'},\n",
       " {'meta': {'pmid': 11409575, 'language': 'eng'},\n",
       "  'text': 'Clinical signs of hypoxaemia in children with acute lower respiratory infection: indicators of oxygen therapy ...'},\n",
       " {'meta': {'pmid': 11409576, 'language': 'eng'},\n",
       "  'text': \"Hypoxaemia in children with severe pneumonia in Papua New Guinea ...\"},\n",
       " {'meta': {'pmid': 11409577, 'language': 'eng'},\n",
       "  'text': 'Oxygen concentrators and cylinders ...'},\n",
       " {'meta': {'pmid': 11409578, 'language': 'eng'},\n",
       "  'text': 'Oxygen supply in rural africa: a personal experience ...'}]"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset_head = pubmed_dataset_streamed.take(5)\n",
    "list(dataset_head)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Skip the first 1,000 examples and include the rest in the training set\n",
    "train_dataset = shuffled_dataset.skip(1000)\n",
    "# Take the first 1,000 examples for the validation set\n",
    "validation_dataset = shuffled_dataset.take(1000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'meta': {'case_ID': '110921.json',\n",
       "  'case_jurisdiction': 'scotus.tar.gz',\n",
       "  'date_created': '2010-04-28T17:12:49Z'},\n",
       " 'text': '\\n461 U.S. 238 (1983)\\nOLIM ET AL.\\nv.\\nWAKINEKONA\\nNo. 81-1581.\\nSupreme Court of United States.\\nArgued January 19, 1983.\\nDecided April 26, 1983.\\nCERTIORARI TO THE UNITED STATES COURT OF APPEALS FOR THE NINTH CIRCUIT\\n*239 Michael A. Lilly, First Deputy Attorney General of Hawaii, argued the cause for petitioners. With him on the brief was James H. Dannenberg, Deputy Attorney General...'}"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "law_dataset_streamed = load_dataset(\n",
    "    \"json\",\n",
    "    data_files=\"https://the-eye.eu/public/AI/pile_preliminary_components/FreeLaw_Opinions.jsonl.zst\",\n",
    "    split=\"train\",\n",
    "    streaming=True,\n",
    ")\n",
    "next(iter(law_dataset_streamed))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'meta': {'pmid': 11409574, 'language': 'eng'},\n",
       "  'text': 'Epidemiology of hypoxaemia in children with acute lower respiratory infection ...'},\n",
       " {'meta': {'case_ID': '110921.json',\n",
       "   'case_jurisdiction': 'scotus.tar.gz',\n",
       "   'date_created': '2010-04-28T17:12:49Z'},\n",
       "  'text': '\\n461 U.S. 238 (1983)\\nOLIM ET AL.\\nv.\\nWAKINEKONA\\nNo. 81-1581.\\nSupreme Court of United States.\\nArgued January 19, 1983.\\nDecided April 26, 1983.\\nCERTIORARI TO THE UNITED STATES COURT OF APPEALS FOR THE NINTH CIRCUIT\\n*239 Michael A. Lilly, First Deputy Attorney General of Hawaii, argued the cause for petitioners. With him on the brief was James H. Dannenberg, Deputy Attorney General...'}]"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from itertools import islice\n",
    "from datasets import interleave_datasets\n",
    "\n",
    "combined_dataset = interleave_datasets([pubmed_dataset_streamed, law_dataset_streamed])\n",
    "list(islice(combined_dataset, 2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'meta': {'pile_set_name': 'Pile-CC'},\n",
       " 'text': 'It is done, and submitted. You can play “Survival of the Tastiest” on Android, and on the web...'}"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "base_url = \"https://the-eye.eu/public/AI/pile/\"\n",
    "data_files = {\n",
    "    \"train\": [base_url + \"train/\" + f\"{idx:02d}.jsonl.zst\" for idx in range(30)],\n",
    "    \"validation\": base_url + \"val.jsonl.zst\",\n",
    "    \"test\": base_url + \"test.jsonl.zst\",\n",
    "}\n",
    "pile_dataset = load_dataset(\"json\", data_files=data_files, streaming=True)\n",
    "next(iter(pile_dataset[\"train\"]))"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "name": "Big data? 🤗 Datasets ao resgate",
   "provenance": []
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
